By Zhonghao Zhao and Bingxue An, under advise of Professor Bo Li
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import math
import warnings
warnings.filterwarnings('ignore')
import sys
!{sys.executable} -m pip install pandas-profiling
import pandas_profiling
# read data
data_number = [1,2,3]
dataset = pd.read_csv("DATA/"+ "AIS_Challenge_Problem_Set_0_"+ str(1) + "_withVID.csv")
dataset_2 = pd.read_csv("DATA/"+ "AIS_Challenge_Problem_Set_0_"+ str(2) + "_withVID.csv")
dataset_3 = pd.read_csv("DATA/"+ "AIS_Challenge_Problem_Set_0_"+ str(1) + "_withVID.csv")
dataset.info()
dataset.head(5)
dataset.head(20)
#check whether there is missing values in the data
dataset.isnull().sum()
#summary of the data
#pandas_profiling.ProfileReport(dataset)
# A method to convert HH-MM-SS to seconds
def get_sec(time_str):
h, m, s = time_str.split(':')
return int(h) * 3600 + int(m) * 60 + int(s)
# a node class for each data
class Vessel_node:
def __init__(self, object_id, time, lat, lon, speed, direction, real_VID):
self.OBJECT_ID = object_id
self.VID = -1
self.Time = time
self.LAT = lat
self.LON = lon
self.Speed = speed*0.514444
self.Direction = direction
self.real_vid = real_VID
def set_VID(self, vid):
self.VID = vid
def get_ID(self):
return self.OBJECT_ID
def get_real_vid(self):
return self.real_vid
def get_LAT(self):
return self.LAT
def get_LON(self):
return self.LON
def get_Time(self):
return self.Time
def get_SPEED(self):
return self.Speed
def get_DIRECTION(self):
return self.Direction
# a track class to keep all tracks
class All_tracks:
def __init__(self):
self.tracks = []
self.total_track = 0
self.node_tracks = []
def get_total_track(self):
return self.total_track
def add_track(self, node):
new_track = [node.get_ID()]
new_node_track = [node]
self.total_track += 1
node.set_VID(self.total_track)
self.tracks.append(new_track)
self.node_tracks.append(new_node_track)
def add_node(self, track_id, node):
node.set_VID(track_id)
self.tracks[track_id - 1].append(node.get_ID())
self.node_tracks[track_id - 1].append(node)
def print_track(self):
for track in self.tracks:
print(track)
print('\n')
def print_track_number(self):
print(self.total_track)
def get_node_track(self):
return self.node_tracks
def get_track_number(self):
return self.total_track
def get_tracks(self):
return self.tracks
'''The initial assumption is the accleration is constant between two nodes, use v_1-v_0 / (t_1-t_0) to calculate
the accleration, use v_0*t + 0.5*a*t^2 to calculate the desired final x, y location, then compare it with the final
x, y location, calculate the desired distance difference sqrt (x_expect-x)^2 + (y_expect-y)^2 for every track, compare
it with the actual distane, using actual lon and lat to calculate
'''
def calculate_distance(lat2, lon2, lat1, lon1):
R = 6371000; #earth radian
l_1 = math.radians(lat1)
l_2 = math.radians(lat2)
lat_diff = math.radians(lat2-lat1)
lon_diff = math.radians(lon2-lon1)
temp = math.sin(lat_diff/2)**2 + math.cos(l_1) * math.cos(l_2) * math.sin(lon_diff/2)**2
c = 2 * math.atan2(math.sqrt(temp), math.sqrt(1-temp))
d = R * c
return d
''' This function will return the distance error between two nodes with the input of these two nodes
distance error is defined as the square of error of (expected distance - actual distance)
'''
def calculate_DE(node_a, node_b):
a = node_a
b = node_b
angle_1 = math.radians(a.get_DIRECTION())
angle_2 = math.radians(b.get_DIRECTION())
v_diff_x = b.get_SPEED()*math.cos(angle_2) - a.get_SPEED()*math.cos(angle_1)
v_diff_y = b.get_SPEED()*math.sin(angle_2) - a.get_SPEED()*math.sin(angle_1)
t_diff = b.get_Time()- a.get_Time()
acc_x = v_diff_x / t_diff
acc_y = v_diff_y / t_diff
lon_diff_expect = a.get_SPEED()*math.cos(angle_1)*t_diff + 0.5*acc_x*t_diff**2
lat_diff_expect = a.get_SPEED()*math.sin(angle_1)*t_diff + 0.5*acc_y*t_diff**2
total_diff = math.sqrt(lon_diff_expect**2 + lat_diff_expect**2)
d = calculate_distance(b.get_LAT(), b.get_LON(), a.get_LAT(), b.get_LON())
error = (abs(d)-abs(total_diff))**2
return error
'''Organize new training dataset for ML, determing when to start new track
1 stands the two nodes are in same track, 0 stands not.
'''
vid = []
train_tracker = All_tracks()
errors = []
outcome = []
prev_row = 0
prev_node = 0
for index, row in dataset.iterrows():
#sequencial compare in general
cur_row = row.tolist()
cur_node = Vessel_node(cur_row[0],get_sec(cur_row[2]), cur_row[3], cur_row[4], cur_row[5]/10, cur_row[6]/10, cur_row[1])
cur_vid = cur_row[1]
if(prev_row):
if cur_vid != prev_row[1] and cur_node.get_Time() != prev_node.get_Time():
temp_error = calculate_DE(prev_node, cur_node)
errors.append(temp_error)
outcome.append(0)
prev_row = cur_row
prev_node = cur_node
#sequencial compare in same track
if(vid and (cur_vid in vid)):
node_tracks = train_tracker.get_node_track()
for i in range(train_tracker.get_track_number()):
cur_track = node_tracks[i]
last = cur_track[-1]
if(last.get_real_vid() == cur_vid):
train_tracker.add_node(i+1, cur_node)
temp_error = calculate_DE(last, cur_node)
errors.append(temp_error)
outcome.append(1)
else:
continue
if(cur_vid not in vid):
vid.append(cur_vid)
train_tracker.add_track(cur_node)
"""
see whether we get a balanced training dataset
"""
count_0 = 0
count_1 = 0
for item in outcome:
if item == 0:
count_0 += 1
else:
count_1 += 1
print(count_0, count_1)
#k fold cross validation, k = 10
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier # Import Random Forest Classifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
X = errors
y = outcome
kf = KFold(n_splits=10)
kf.get_n_splits(X)
KFold(n_splits=10, random_state=None, shuffle=False)
decision_scores = []
best_dicision = 0
randomForest_scores = []
best_randomForest = 0
logistic_scores = []
best_logistic = 0
for train_index, test_index in kf.split(X):
X_train, X_test = [[X[i]] for i in train_index], [[X[i]] for i in test_index]
y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
#decision tree
dec = DecisionTreeClassifier()
dec = dec.fit(X_train,y_train)
y_pred_d = dec.predict(X_test)
current_score_d = metrics.accuracy_score(y_test, y_pred_d)
if not decision_scores or current_score_d > max(decision_scores):
best_dicision = dec
decision_scores.append(current_score_d)
#random forest
ran = RandomForestClassifier(n_estimators=10)
ran = ran.fit(X_train,y_train)
y_pred_r = ran.predict(X_test)
current_score_r = metrics.accuracy_score(y_test, y_pred_r)
if not randomForest_scores or current_score_r > max(randomForest_scores):
best_randomForest = ran
randomForest_scores.append(current_score_r)
#logistic regression
logreg = LogisticRegression()
logreg = logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)
current_score_l = metrics.accuracy_score(y_test, y_pred)
if not logistic_scores or current_score_l > max(decision_scores):
best_logistic = logreg
logistic_scores.append(current_score_l)
print(decision_scores)
print(randomForest_scores)
print(logistic_scores)
def Average(lst):
return sum(lst) / len(lst)
print(Average(decision_scores))
print(Average(randomForest_scores))
print(Average(logistic_scores))
#visualize decision tree
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(best_dicision, out_file=dot_data)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())